The data contains features extracted from the silhouettes of vehicles viewed at different angles. Four "Corgi" model vehicles were used for the experiment: a double-decker bus, a Chevrolet van, a Saab 9000, and an Opel Manta 400. This particular combination of vehicles was chosen with the expectation that the bus, the van, and either one of the cars would be readily distinguishable, but that it would be more difficult to distinguish between the two cars.
The purpose is to classify a given silhouette as one of three types of vehicle, using a set of features extracted from the silhouette. The vehicle may be viewed from one of many different angles.
## import packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn import metrics
from scipy.stats import zscore
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
## Importing data
# Load the vehicle-silhouette dataset and take a first look at it.
vehicle = pd.read_csv('vehicle-1.csv')
vehicle.head()
vehicle.shape
vehicle.info();
vehicle.describe().T  # transposed so each feature is a row — easier to scan
# Univariate density plot per feature to inspect distributions/skew.
vehicle.plot.kde(subplots =True, layout = (6,4), figsize = (15,18), sharex=False);
# Pairwise scatter/density, colored by vehicle class.
sns.pairplot(vehicle,hue = 'class');
### Plotting how many other columns each feature is highly correlated with
# Keep only correlation entries with |r| > 0.9, excluding the diagonal (|r| == 1);
# everything else becomes NaN, so count() below counts "highly correlated partners".
corr =vehicle.corr()[(abs(vehicle.corr())>0.9)& (abs(vehicle.corr())<1)]
fig, ax = plt.subplots(figsize = (10,8))
# Bar per feature: number of OTHER features it is highly correlated with.
sns.barplot(y = corr.count().index, x =corr.count(), ax = ax)
ax.set(xlabel="# Features Highly Correlated With", ylabel='Feature Name');
# Class balance: one bar per class (most frequent first), each annotated with
# its percentage share of the dataset.
ax = sns.countplot(vehicle['class'], order=vehicle['class'].value_counts().index)
plt.ylabel("% Records")
share_pct = round(100 * vehicle['class'].value_counts(normalize=True), 0)
for patch, pct in zip(ax.patches, share_pct):
    ax.annotate(pct, (patch.get_x() + 0.3, patch.get_height() + 5))
The target variable is evenly distributed between 'Bus' and 'Van'; however, the majority of records belong to the 'Car' class.
## Identifying Columns with Null Values
# Per-column count of missing values, restricted to columns that have any.
vehicle.isnull().sum()[vehicle.isnull().sum()>0]
## Identifying the unique records with null records
# Rows with at least one missing value. `isnull().any(axis=1)` replaces the
# original transpose-and-sum construction (`(df.isnull().T.sum()>0).T`) with
# the idiomatic, equivalent boolean mask.
null_rows = vehicle[vehicle.isnull().any(axis=1)]
print("Number of unique records with Null Values: ",null_rows.shape[0])
print("Records with Null Values -")
null_rows.head()
Since the number of records with missing values is small, we can replace them with the median. We choose the median because outliers are present in the data.
PCA is highly impacted by outliers and it is vital to handle these
## Calculating how many outliers per column
def outlier_count(x):
    """Return how many values of *x* fall outside mean +/- 2 standard deviations.

    Expects a pandas Series; uses the population std (np.std, ddof=0).
    """
    mu, sigma = np.mean(x), np.std(x)
    hi, lo = mu + 2 * sigma, mu - 2 * sigma
    return x[(x > hi) | (x < lo)].count()
# Outlier count for every feature column (target excluded).
vehicle.drop(['class'],axis=1).apply(lambda x:outlier_count(x))
##Visualizing Outliers
# Box plot per feature; points beyond the whiskers are the outliers counted above.
vehicle.plot.box(subplots =True, layout = (5,4), figsize = (15,20));
We have identified Null Values and Outliers in the dataset and will handle the same after splitting the data into Train and Test (Next Section), to avoid any data leakage while developing the model.
## Encoding Target Variable
# Encode the string class labels as integers (LabelEncoder assigns codes in
# sorted order: Bus=0, Car=1, Van=2).
li = LabelEncoder()
y_original = vehicle['class']  # keep the original string labels for reference
vehicle['class'] = li.fit_transform(np.array(vehicle['class']))
The following encoding scheme is applied- 0 - Bus 1 - Car 2 - Van
## Split data into train and test
X = vehicle.drop(['class'], axis =1)  # feature matrix (silhouette features only)
y = vehicle['class']  # integer-encoded target
# 70/30 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train,y_test = train_test_split(X,y, test_size =0.3, random_state =10)
print(X_train.shape, X_test.shape, y_train.shape,y_test.shape)
## Replacing Outliers with Median
def outlier_replace(x):
    """Return *x* with values outside mean +/- 2 std replaced by the median.

    The median, mean and std are all computed from *x* itself (including the
    outliers). Returns a numpy array of the same length as *x*.
    """
    med = np.median(x)
    mu, sigma = np.mean(x), np.std(x)
    hi, lo = mu + 2 * sigma, mu - 2 * sigma
    return np.where((x > hi) | (x < lo), med, x)
# Replace per-column outliers with that column's median.
# NOTE(review): the test-set replacement uses statistics computed from the
# test data itself — a mild form of leakage; confirm this is intentional.
X_train1 =X_train.apply(lambda x:outlier_replace(x))
X_test1 =X_test.apply(lambda x:outlier_replace(x))
X1 = X.apply(lambda x:outlier_replace(x))  # full dataset, used later for CV
## Replacing Nulls with Median
# Median imputation of remaining missing values.
si = SimpleImputer(strategy = "median")
X_train2 = si.fit_transform(X_train1)
# Fix: transform (not fit_transform) the test set so the imputation medians
# come from the training data — fitting on the test set is data leakage.
X_test2 = si.transform(X_test1)
# Separate imputer for the full dataset (used later for cross-validation) so
# the train-fitted statistics above are not overwritten.
X2 = SimpleImputer(strategy = "median").fit_transform(X1)
## Scaling the dataset
# Standardize features to zero mean / unit variance (required before PCA/SVM).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train2)
# Fix: scale the test set with statistics learned on the training set;
# re-fitting the scaler on test data leaks test information into the model.
X_test_scaled = scaler.transform(X_test2)
# Independent scaler for the full dataset used by cross-validation below.
X_scaled = StandardScaler().fit_transform(X2)
##Grid Search for best hyperparameter combination for our data
# Exhaustive grid search over SVM hyperparameters (48 combinations),
# using GridSearchCV's default cross-validation on the training data.
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=False)
x=grid.fit(X_train_scaled,y_train)
print(grid.best_estimator_)  # best hyperparameter combination found
Based on above grid search, we will use C=10 and gamma =0.1
## Creating a baseline SVM model using all raw features
# Hyperparameters C=10, gamma=0.1 taken from the grid search above.
svc = SVC(gamma =0.1 , C =10, kernel ='rbf',random_state =10)
model_allfeatures = svc.fit(X_train_scaled,y_train)
score = pd.DataFrame(index=['Accuracy'])  # accumulates accuracy results per model
y_pred = svc.predict(X_test_scaled)
print ("The Test Accuracy = ",metrics.accuracy_score(y_pred, y_test)*100,"%")
score['Raw Features']= (metrics.accuracy_score(y_pred, y_test)*100)
## Checking for Cross Validation Score
# 5-fold CV of the raw-feature SVM over the full (scaled) dataset.
# NOTE(review): X_scaled was outlier-treated/imputed/scaled on the full set,
# so these CV scores may be slightly optimistic — confirm acceptable.
cv = KFold(n_splits=5, random_state=10, shuffle=True)
score['Raw Features CV'] = 100*np.mean(cross_val_score(model_allfeatures, X_scaled, y, scoring='accuracy', cv=cv, n_jobs=-1))
print ("The CV Accuracy = ",score['Raw Features CV'][0],"%")
## Creating Principal Components
# Fit PCA with all components retained, to inspect explained variance first.
pca = PCA(random_state =10)
pca.fit(X_scaled)
## Exploring the load factors for each component
# Loadings matrix: one row per principal component, one column per feature.
pca_components = pd.DataFrame(pca.components_,columns = X.columns)
pca_components
##Analysing Scree Plot to choose the right number of Principal Components
# Derive the x-axis labels from the fitted PCA instead of a hard-coded list of
# 18 strings, so the plots stay correct if the feature count ever changes.
component_labels = [str(i) for i in range(1, len(pca.explained_variance_ratio_) + 1)]
plt.bar(component_labels, pca.explained_variance_ratio_, align ='center');
## Plotting Cumulative Explained Variance Graph
plt.step(component_labels, np.cumsum(pca.explained_variance_ratio_));
np.cumsum(pca.explained_variance_ratio_)
From the above charts it is evident that Top 9 principal components are covering ~95% of the Explained Variance and will use these for re-modelling
## Looking at the model accuracy value for every n_components value
# Test accuracy of an SVM (C=10, gamma=0.1, rbf) for each candidate number of
# principal components. Restores the loop indentation lost in extraction and
# replaces the index-write-into-a-prefilled-list hack with a plain accumulator.
component_range = range(2, 19)
accuracies = []
for n_comp in component_range:
    pca = PCA(n_components=n_comp, random_state=10)
    # NOTE(review): PCA is fitted on the full dataset, not just the training
    # split — a mild leakage into the test accuracy; confirm intentional.
    pca.fit(X_scaled)
    X_train_scaled_reduced = pca.transform(X_train_scaled)
    X_test_scaled_reduced = pca.transform(X_test_scaled)
    svc = SVC(gamma=0.1, C=10, kernel='rbf', random_state=10)
    svc.fit(X_train_scaled_reduced, y_train)
    y_pred = svc.predict(X_test_scaled_reduced)
    accuracies.append(metrics.accuracy_score(y_pred, y_test) * 100)
sns.lineplot(x=list(component_range), y=accuracies);
plt.xlabel("Number of Components")
plt.ylabel("SVM Model Accuracy %");
The accuracy for 10 principal components is around 96%, beyond which the accuracy remains almost similar. We also know from above that 9 components cover ~95% variation in data. So 10 components also would cover more than 95% variation. Hence, we would go ahead with Top 10 principal components to rebuild our model
### Shortlisting components and recreating PCA - Selecting top 10
# Refit PCA keeping the top 10 components (chosen from the scree/accuracy
# analysis above) and project all three data matrices into that space.
# NOTE(review): PCA is fitted on the full dataset rather than the training
# split — confirm this leakage is acceptable.
pca = PCA(n_components =10, random_state =10)
pca.fit(X_scaled)
X_scaled_reduced = pca.transform(X_scaled)
X_train_scaled_reduced = pca.transform(X_train_scaled)
X_test_scaled_reduced = pca.transform(X_test_scaled)
## Looking at their bivariate distribution
sns.pairplot(pd.DataFrame(X_scaled_reduced));
Observations
# Final SVM trained on the 10 principal components, same hyperparameters
# as the raw-feature baseline for a fair comparison.
svc = SVC(gamma =0.1 , C =10, kernel ='rbf',random_state=10)
model_reducedfeatures = svc.fit(X_train_scaled_reduced,y_train)
y_pred = svc.predict(X_test_scaled_reduced)
# Train vs test accuracy — the gap indicates the degree of overfitting.
print ("The Train Accuracy = ",metrics.accuracy_score(svc.predict(X_train_scaled_reduced),y_train)*100,"%")
print ("The Test Accuracy = ",metrics.accuracy_score(y_pred, y_test)*100,"%")
score['PCA Features']= (metrics.accuracy_score(y_pred, y_test)*100)
## Cross Validation Score for PCA features model
# 5-fold CV of the PCA-feature SVM over the full reduced dataset, for
# comparison against the raw-feature CV score recorded earlier.
cv = KFold(n_splits=5, random_state=10, shuffle=True)
score['PCA Features CV'] = 100*np.mean(cross_val_score(model_reducedfeatures, X_scaled_reduced, y, scoring='accuracy', cv=cv, n_jobs=-1))
print ("The CV Accuracy = ",score['PCA Features CV'][0],"%")
score  # summary table of all recorded accuracies